In [1]:
import pandas as pd
import plotly.graph_objects as go
import os
In [2]:
# pull each state's data into one dataframe
data_folder = 'lib/state/'

# os.listdir returns names in arbitrary order and also reports sub-directories
# and hidden entries (for example Jupyter's .ipynb_checkpoints folder), so keep
# only regular, non-hidden files and sort them to make the row order reproducible.
files = sorted(file for file in os.listdir(data_folder)
               if not file.startswith('.')
               and os.path.isfile(os.path.join(data_folder, file)))
if not files:
    # pd.concat would fail on an empty list anyway; fail with a clearer message
    raise ValueError(f'no data files found in {data_folder!r}')

# the files are read without a header row; columns are labelled afterwards
agg_df = pd.concat([pd.read_csv(os.path.join(data_folder, file),
                                header=None)
                    for file in files])
In [3]:
# the files were read with header=None, so label the five columns here
column_labels = ['state', 'sex', 'year', 'name', 'count']
agg_df.columns = column_labels
In [4]:
# preview the combined frame (the rendered table shows only the first and last rows)
agg_df
Out[4]:
state sex year name count
0 AK F 1910 Mary 14
1 AK F 1910 Annie 12
2 AK F 1910 Anna 10
3 AK F 1910 Margaret 8
4 AK F 1910 Helen 7
... ... ... ... ... ...
28970 WY M 2022 Lane 5
28971 WY M 2022 Michael 5
28972 WY M 2022 Nicholas 5
28973 WY M 2022 River 5
28974 WY M 2022 Silas 5

6408041 rows × 5 columns

In [5]:
# define a function to get most common first letter for a state-year pair
def most_common_first_letter(s):
    """Return the first letter whose names have the largest summed count.

    Parameters
    ----------
    s : pandas.DataFrame
        Rows with a string ``name`` column and a numeric ``count`` column
        (one state-year group when used with ``groupby(...).apply``).

    Returns
    -------
    str
        The first character shared by the names with the highest total
        ``count``. Ties go to the letter that sorts first, because the
        grouped totals are sorted by letter and ``idxmax`` returns the
        first maximum.

    The input frame is left untouched: the first letters are derived into a
    separate Series instead of overwriting ``s['name']``.
    """
    first_letters = s['name'].str[0]
    return s.groupby(first_letters)['count'].sum().idxmax()
In [6]:
# compute the most common first letter for every state-year pair:
# one call of most_common_first_letter per (state, year) group
state_year_groups = agg_df.groupby(['state', 'year'])
df = state_year_groups[['name', 'count']].apply(most_common_first_letter)
In [7]:
# move the state/year group keys out of the index into regular columns
df = df.reset_index(drop=False)
In [8]:
# the value column is labelled 0 at this point; give it a descriptive name
letter_column = {0 : 'firstLetter'}
df = df.rename(columns=letter_column)
In [9]:
# preview the per-state, per-year result (the rendered table shows only the first and last rows)
df
Out[9]:
state year firstLetter
0 AK 1910 A
1 AK 1911 J
2 AK 1912 J
3 AK 1913 M
4 AK 1914 J
... ... ... ...
5758 WY 2018 A
5759 WY 2019 E
5760 WY 2020 A
5761 WY 2021 E
5762 WY 2022 E

5763 rows × 3 columns

In [10]:
# for each year and each most common letter that is represented by
# at least one state, take the most common baby name that year
# that starts with that letter

# Total every name once per year (summing over states and sexes) instead of
# re-scanning all of agg_df for each (year, letter) pair.
name_totals = agg_df.groupby(['year', 'name'], as_index=False)['count'].sum()
name_totals['firstLetter'] = name_totals['name'].str[0]

# Pick the highest-total name inside each (year, first letter) group.
# name_totals is ordered by name within a year and idxmax returns the first
# maximum, so ties resolve to the alphabetically first name.
top_rows = name_totals.loc[
    name_totals.groupby(['year', 'firstLetter'])['count'].idxmax()
]
top_name_by_year_letter = dict(
    zip(zip(top_rows['year'], top_rows['firstLetter']), top_rows['name'])
)

rep_names = dict()
years = df['year'].unique()
for year in years:
    # letters in order of first appearance among that year's states
    letters = df.loc[df['year'] == year, 'firstLetter'].unique()
    rep_names[year] = [top_name_by_year_letter[(year, letter)]
                       for letter in letters]
In [11]:
def _traces_for_year(year):
    """Build the two map traces for one year: shaded states plus letter labels."""
    rows = df[df['year'] == year]
    shaded_states = go.Choropleth(
        locations=rows['state'],
        locationmode='USA-states',
        # colour each state by the code point of its letter
        z=rows['firstLetter'].apply(ord),
        colorscale='Viridis',
        # 90 is ord('Z'); 64 is one below ord('A')
        zmin=64, zmax=90,
    )
    letter_labels = go.Scattergeo(
        locationmode='USA-states',
        locations=rows['state'],
        text=rows['firstLetter'],
        textfont={'color' : 'White'},
        mode='text',
    )
    return [shaded_states, letter_labels]


def _map_layout(title_text, **extras):
    """Return the shared 900x600 US-map layout with a centred title.

    Any extra keyword arguments are passed straight through to go.Layout.
    """
    return go.Layout(
        width=900, height=600,
        title={'text' : title_text, 'x' : 0.5},
        geo=dict(scope='usa', projection_type='albers usa'),
        **extras,
    )


# a single "Play" button that starts the animation
play_menu = dict(type="buttons",
                 buttons=[dict(label="Play",
                               method="animate",
                               args=[None])])

# one animation frame per year, each annotated with that year's representative names
animation_frames = []
for year in years:
    names_note = dict(
        x=.3,
        y=.95,
        xanchor='left',
        xref='paper',
        yref='paper',
        text='Most common name starting with these letters: ' + ', '.join(rep_names[year]),
        showarrow = False,
    )
    frame_layout = _map_layout(
        f'Most common first letter for baby names in each state in {year}',
        annotations=[names_note],
    )
    animation_frames.append(go.Frame(data=_traces_for_year(year),
                                     layout=frame_layout))

# the figure initially shows the 1911 data under a generic title
fig = go.Figure(
    data=_traces_for_year(1911),
    layout=_map_layout(
        'Most common first letter for baby names in each state per year',
        updatemenus=[play_menu],
    ),
    frames=animation_frames,
)

fig.show()